#!pip install -q pycountry
#!pip install gensim
#!pip install textblob
#!pip install wordcloud
#!pip install plotly
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
import math
import re

import numpy as np
import pandas as pd
import scipy as sp
import pycountry
from sklearn import metrics
from sklearn.utils import shuffle
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import nltk
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import random
import networkx as nx
from pandas import Timestamp
import requests
from IPython.display import HTML
import seaborn as sns
from tqdm import tqdm
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
# Notebook-wide setup: enable pandas `.progress_apply` progress bars and fix
# the NumPy RNG seed and Python hash seed for reproducible runs.
tqdm.pandas()
np.random.seed(0)
%env PYTHONHASHSEED=0
# NOTE(review): `warnings` is already imported and filtered at the top of the
# file — these two lines are redundant (harmless) duplicates.
import warnings
warnings.filterwarnings("ignore")
# Load the four pre-cleaned CORD-19 paper subsets from local disk.
biorxiv_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/biorxiv_clean.csv')
pmc_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/clean_pmc.csv')
comm_use_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/clean_comm_use.csv')
noncomm_use_df = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/cleaned csv/clean_noncomm_use.csv')
# Stack the four subsets into a single papers table with a fresh 0..n-1 index.
papers_df = pd.concat([pmc_df,
biorxiv_df,
comm_use_df,
noncomm_use_df], axis=0).reset_index(drop=True)
papers_df  # notebook display: the combined papers table
papers_df['authors']  # notebook display: raw author strings column
# Case-count time series (per date / country / province).
full_table = pd.read_csv('D:/GSUCoursework/BigDataExp/Datasets/Covid_19_clean/Covid_19_latest/covid_19_clean_complete.csv')
full_table.head(10)  # notebook display: first 10 rows
# Converting Date column to datetime datatype
full_table.dtypes  # notebook display: dtypes before conversion ('Date' is object)
full_table['Date'] = pd.to_datetime(full_table['Date'])
full_table.dtypes  # notebook display: confirm 'Date' is now datetime64
# Checking for null values
full_table.isna().sum()  # notebook display: null counts per column
# Filling missing values: use '' so later .str.contains on Province/State won't hit NaN
full_table['Province/State'] = full_table['Province/State'].fillna('')
full_table.isna().sum()  # notebook display: verify Province/State has no nulls left
# Replacing 'Mainland China' with just 'China' to unify the country label
full_table['Country/Region'] = full_table['Country/Region'].replace('Mainland China', 'China')
# New column 'Active' = Confirmed - Deaths - Recovered (cases still open)
full_table['Active'] = full_table['Confirmed'] - full_table['Deaths'] - full_table['Recovered']
full_table.head(10)  # notebook display: first 10 rows including 'Active'
# Cases on the ships ('Grand Princess' province rows or 'Cruise Ship' country rows)
ship = full_table[full_table['Province/State'].str.contains('Grand Princess')|full_table['Country/Region'].str.contains('Cruise Ship')]
# Separate China-only data into `china` and all other countries/regions into `row`
# (row = rest of world)
china = full_table[full_table['Country/Region']=='China']
row = full_table[full_table['Country/Region']!='China']
# Latest cases: snapshot of the most recent date in the table
full_latest = full_table[full_table['Date'] == max(full_table['Date'])].reset_index()
china_latest = full_latest[full_latest['Country/Region']=='China']
row_latest = full_latest[full_latest['Country/Region']!='China']
# Latest condensed: per-region case totals at the most recent date.
# FIX: multi-column selection after groupby must use a list of columns —
# the tuple form df.groupby(k)['a', 'b'] was removed in pandas 2.0.
case_cols = ['Confirmed', 'Deaths', 'Recovered', 'Active']
full_latest_grouped = full_latest.groupby('Country/Region')[case_cols].sum().reset_index()
china_latest_grouped = china_latest.groupby('Province/State')[case_cols].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')[case_cols].sum().reset_index()
# Worldwide daily totals, reduced to the single most recent date.
# (A dead groupby-max assignment to `temp` was removed here: it was
# immediately overwritten by the next statement and never used.)
temp = full_table.groupby('Date')[case_cols].sum().reset_index()
temp = temp[temp['Date']==max(temp['Date'])].reset_index(drop=True)
temp.style.background_gradient(cmap='Pastel1')  # notebook display: styled totals
countries = full_table['Country/Region'].unique().tolist()
print(countries)
print("\nTotal countries affected by virus: ",len(countries))
1) Saves time when initially exploring your dataset
2) Makes it easy to modify and export your plot
3) Offers a more ornate visualization, which is well-suited for conveying the important insights hidden within your dataset
def pie_plot(cnt_srs, colors, title):
    """Build a Plotly donut-style pie trace from a value-counts Series.

    cnt_srs : pandas Series; its index supplies slice labels, its values
              supply slice sizes.
    colors  : slice colours forwarded to the marker.
    title   : text rendered with the pie (shown inside the donut hole).
    Returns the configured ``go.Pie`` trace (not a full figure).
    """
    return go.Pie(
        labels=cnt_srs.index,
        values=cnt_srs.values,
        title=title,
        hoverinfo='percent+value',
        textinfo='percent',
        textposition='inside',
        hole=0.7,          # donut hole size
        showlegend=True,
        marker=dict(colors=colors,
                    line=dict(color='#000000', width=2)),
    )
'''Plotly visualization'''
# Offline-mode plotly setup for Jupyter; re-imports `go` under the same
# alias already used elsewhere in this file.
import plotly.offline as py
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
py.init_notebook_mode(connected = True) # Required to use plotly offline in jupyter notebook
# Donut chart: share of rows (reporting locations x dates) per country.
py.iplot([pie_plot(full_table['Country/Region'].value_counts(), ['cyan', 'gold'], 'Country')])
# World wide
# Bubble map of the latest snapshot: one circle per reporting location,
# radius scaled by Confirmed**1.1, tooltip listing country/province/counts.
# NOTE(review): `folium` is used below but never imported anywhere in this
# file — add `import folium` before running this cell or it raises NameError.
temp = full_table[full_table['Date'] == max(full_table['Date'])]
m = folium.Map(location=[0, 0], tiles='cartodbpositron',
min_zoom=1, max_zoom=4, zoom_start=1)
for i in range(0, len(temp)):
folium.Circle(
location=[temp.iloc[i]['Lat'], temp.iloc[i]['Long']],
color='crimson', fill='crimson',
tooltip = '<li><bold>Country : '+str(temp.iloc[i]['Country/Region'])+
'<li><bold>Province : '+str(temp.iloc[i]['Province/State'])+
'<li><bold>Confirmed : '+str(temp.iloc[i]['Confirmed'])+
'<li><bold>Deaths : '+str(temp.iloc[i]['Deaths']),
radius=int(temp.iloc[i]['Confirmed'])**1.1).add_to(m)
m  # notebook display: render the map
# Per-day, per-country totals.
# FIX: multi-column selection after groupby must use a list of columns —
# the tuple form df.groupby(k)['a', 'b'] was removed in pandas 2.0.
full_grouped = full_table.groupby(['Date', 'Country/Region'])[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
# Over the time: animated choropleth of confirmed cases; the log colour
# scale keeps countries with small counts visible next to large outbreaks.
fig = px.choropleth(full_grouped, locations="Country/Region", locationmode='country names', color=np.log(full_grouped["Confirmed"]),
                    hover_name="Country/Region", animation_frame=full_grouped["Date"].dt.strftime('%Y-%m-%d'),
                    title='Cases over time', color_continuous_scale=px.colors.sequential.Magenta)
fig.update(layout_coloraxis_showscale=False)
fig.show()
# Latest snapshot, then the 20 countries with the most confirmed cases.
top = full_table[full_table['Date'] == full_table['Date'].max()]
top_casualities = top.groupby(by = 'Country/Region')['Confirmed'].sum().sort_values(ascending = False).head(20).reset_index()
top_casualities  # notebook display: the ranking table
plt.figure(figsize= (15,10))
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.xlabel("Total cases",fontsize = 30)
plt.ylabel('Country',fontsize = 30)
plt.title("Top 20 countries having most confirmed cases" , fontsize = 30)
ax = sns.barplot(x = top_casualities['Confirmed'], y = top_casualities['Country/Region'])
# Annotate each bar with its count. FIX: use enumerate — the original
# zip() also unpacked a country-name variable that was never used.
for i, value in enumerate(top_casualities['Confirmed']):
    ax.text(value, i-.05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
Observations :
1) China led in confirmed cases for many days, but it is now bringing the spread of the pandemic under control.
2) The number of confirmed cases is very high in the US, Italy, Spain, and France.
3) By contrast, the number of reported cases in third-world countries is lower.
# Top 20 countries by currently active cases (uses latest snapshot `top`).
top_actives = top.groupby(by = 'Country/Region')['Active'].sum().sort_values(ascending = False).head(20).reset_index()
top_actives  # notebook display: the ranking table
plt.figure(figsize= (15,10))
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.xlabel("Total cases",fontsize = 30)
plt.ylabel('Country',fontsize = 30)
plt.title("Top 20 countries having most active cases" , fontsize = 30)
ax = sns.barplot(x = top_actives['Active'], y = top_actives['Country/Region'])
# Annotate each bar with its count. FIX: use enumerate — the original
# zip() also unpacked a country-name variable that was never used.
for i, value in enumerate(top_actives['Active']):
    ax.text(value, i-.05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
Observations :
1) As covid-19 testing increases, the number of active cases is also increasing day by day.
2) The number of active cases is very high in the US, Italy, Spain, and France.
3) Relative to their total populations, Italy, Spain, Germany and France are the worst-hit nations.
# Top 20 countries by deaths (uses latest snapshot `top`).
top_deaths = top.groupby(by = 'Country/Region')['Deaths'].sum().sort_values(ascending = False).head(20).reset_index()
top_deaths  # notebook display: the ranking table
plt.figure(figsize= (15,10))
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.xlabel("Total cases",fontsize = 30)
plt.ylabel('Country',fontsize = 30)
plt.title("Top 20 countries having most deaths" , fontsize = 30)
ax = sns.barplot(x = top_deaths['Deaths'], y = top_deaths['Country/Region'])
# Annotate each bar with its count. FIX: use enumerate — the original
# zip() also unpacked a country-name variable that was never used.
for i, value in enumerate(top_deaths['Deaths']):
    ax.text(value, i-.05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
Observations :
1) Even though Italy has the 2nd best healthcare system according to the WHO, it has not been able to tackle the pandemic effectively.
2) China, despite having so many confirmed cases, was able to keep the number of deaths down.
3) The number of deaths is also on the rise, especially in Italy, Spain, and Iran.
# Top 20 countries by recovered cases (uses latest snapshot `top`).
top_recovered = top.groupby(by = 'Country/Region')['Recovered'].sum().sort_values(ascending = False).head(20).reset_index()
top_recovered  # notebook display: the ranking table
plt.figure(figsize= (15,10))
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.xlabel("Total cases",fontsize = 30)
plt.ylabel('Country',fontsize = 30)
plt.title("Top 20 countries having most recovered cases" , fontsize = 30)
ax = sns.barplot(x = top_recovered['Recovered'], y = top_recovered['Country/Region'])
# Annotate each bar with its count. FIX: use enumerate — the original
# zip() also unpacked a country-name variable that was never used.
for i, value in enumerate(top_recovered['Recovered']):
    ax.text(value, i-.05, f'{value:,.0f}', size=10, ha='left', va='center')
ax.set(xlabel='Total cases', ylabel='Country')
1) China long led in the number of recoveries despite its huge number of confirmed cases, but Germany has recently surpassed China in recovered cases. No wonder Germany has some of the best healthcare facilities.
2) Italy, Iran, and Spain are also doing a good job.
3) We have to pump up these numbers for a promising future!
from IPython.display import Image
# Display a local illustration of the Namsor service in the notebook.
Image(filename = 'D:/GSUCoursework/BigDataExp/Datasets/Pictures/Namesor.PNG', width = 500, height = 500)
#!pip install namsor-client
# Namsor API client: predicts likely country of origin from personal names.
from namsorclient import NamsorClient
from namsorclient.country_codes import CountryCodes
from namsorclient.request_objects import *
# Create an instance of NamsorClient and pass in your API key as an argument.
# SECURITY(review): the API key is hardcoded in source — move it to an
# environment variable or a secrets store before sharing this notebook.
client = NamsorClient("3ab91fa7404d5bfa2dcd4a183f037cc7")
# Access the origin (GET) endpoint with function that returns a response of type Country of origin.
response = client.origin("Lelouch","Lamperouge")
# Access the different parts of the response for this particular endpoint.
print(response.ID)
print(response.first_name)
print(response.last_name)
print(response.country_origin)
#print(response.gender_scale)
print(response.score)
#print(response.probability_calibrated)
#response = client.origin(repl[0], repl[1])
#repls.append(response.country_origin)
# Refer to the Responses section to view all different variables of each different Response.
# Access the originBatch (POST) endpoint via a batch request object.
origin_batch = OriginBatch()
# Use classify function with required API Key argument and receive responses in the form of a list.
response_list = origin_batch.classify("3ab91fa7404d5bfa2dcd4a183f037cc7")
#print(response_list[2].country_origin)
#client = NamsorClient("3ab91fa7404d5bfa2dcd4a183f037cc7")
def multiple_replace(dict, text):
    """Apply every key -> value substitution in *dict* to *text* in one pass.

    A single alternation regex is built from the (escaped) keys, so the
    replacements do not cascade into each other's output.
    FIX: this function uses ``re``, which was never imported in this file
    (added to the import block).
    NOTE: the first parameter shadows the builtin ``dict``; the name is kept
    unchanged for backward compatibility with keyword callers.
    """
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
    # mo.group(0) is the exact matched key; look up its replacement directly
    # (equivalent to the original mo.string[mo.start():mo.end()] slicing).
    return regex.sub(lambda mo: dict[mo.group(0)], text)
def get_countries(raw_authors):
    """Guess a country of origin for each author in a ', '-separated string.

    FIX: the original loop iterated an undefined global ``names``; it now
    uses the ``raw_authors`` parameter as intended.

    For each author name: detached single-letter initials ("A ", "B ", ...)
    are stripped, stray parentheses and punctuation removed, and a
    placeholder initial substituted for any remaining 1-char token before
    querying the module-level Namsor ``client``.  Returns the list of
    predicted country codes (one per author; network call per name).
    """
    alphabet = ["A", "B", "C", "D", "E", "F",
                "G", "H", "I", "J", "K", "L",
                "M", "N", "O", "P", "Q", "R",
                "S", "T", "U", "V", "W", "X",
                "Y", "Z"]
    # Map "X " -> "" so detached single-letter initials are deleted in one pass.
    repl_dict = dict(zip([a + " " for a in alphabet], [""] * 26))
    repls = []
    for name in raw_authors.split(", "):
        # Remove stray parentheses, then strip leading initials.
        repl = multiple_replace(repl_dict, name.strip().replace(") ", "").replace("( ", ""))
        if len(repl.split()) == 1:
            # Only a surname survived: re-attach the author's first letter
            # as a stand-in first name so the API gets two tokens.
            repl = name[0] + " " + repl
        repl = repl.replace(";", "").replace(":", "").replace(".", "").replace(",", "")
        repl = repl.split(" ")
        for idx in range(len(repl)):
            # Substitute a placeholder for empty/1-char tokens the API rejects.
            if len(repl[idx]) <= 1 and repl[idx] not in alphabet:
                repl[idx] = "A"
        response = client.origin(repl[0], repl[1])
        repls.append(response.country_origin)
    return repls
# Persist the batched Namsor origin responses to a spreadsheet on disk.
origin_batch.export_to_excel("countries.xlsx")
from IPython.display import Image
# Display a local illustration of Word2Vec basics in the notebook.
Image(filename = 'D:/GSUCoursework/BigDataExp/Datasets/Pictures/Word2Vec_Basic.PNG', width = 700, height = 800)
from IPython.display import Image
# Display a local illustration of word-vector relationships.
# NOTE(review): the filename typo 'Wrod2Vec' matches the on-disk file — leave as-is.
Image(filename = 'D:/GSUCoursework/BigDataExp/Datasets/Pictures/Wrod2Vec_Relationships.PNG', width = 800, height = 700)
We can take advantage of these intricate relationships between word vectors to find cures for COVID-19. The steps are as follows:
Step 1 - Find common terms related to the study of COVID-19, such as "infection", "CoV", "viral", etc.
Step 2 - Find the words with the lowest Euclidean distance to these words (the most similar words).
Step 3 - Finally, find the words most similar to those words (second-order similarity). These words will hopefully contain potential COVID-19 cures.
Note that the similarity between two Word2Vec vectors is calculated using the formula below (where u and v are the word vectors).
from IPython.display import Image
# Display the Word2Vec similarity formula referenced in the text above.
Image(filename = 'D:/GSUCoursework/BigDataExp/Datasets/Pictures/Word2Vec_Formula.PNG', width = 500, height = 500)
from IPython.display import Image
# Display the three-step NLP pipeline diagram.
Image(filename = 'D:/GSUCoursework/BigDataExp/Datasets/Pictures/NLP_three_Steps.PNG', width = 500, height = 500)
The approach detailed above is inspired by a research paper called "Unsupervised word embeddings capture latent knowledge from materials science literature", in which the authors find new materials with desirable properties (such as thermoelectricity) based solely on a large corpus of materials-science literature. These materials had never been used for those purposes before, yet they outperform older materials by a large margin. I hope to emulate the same method to look for COVID-19 cures. The diagram below illustrates what the authors did in their research.
from IPython.display import Image
# Display the diagram from the materials-science paper discussed above.
Image(filename = 'D:/GSUCoursework/BigDataExp/Datasets/Pictures/Thermoelectricity.PNG', width = 500, height = 500)
In the diagram above, we can see that the authors found two levels of words similar to "thermoelectric" in a hierarchical manner. The second-order similar words contained compounds like Li2CuSb, Cu7Te5, and CsAgGa2Se4, which turned out to be very good thermoelectric materials in real life.
def nonan(x):
    """Return *x* with newline characters removed if it is a string, else ''.

    Non-string values (typically NaN floats from pandas missing abstracts)
    collapse to the empty string so the corpus join below cannot fail.
    """
    # isinstance is the idiomatic type check (also accepts str subclasses).
    if isinstance(x, str):
        return x.replace("\n", "")
    return ""
# Build one big corpus string from all paper abstracts (missing abstracts
# become '' via nonan, so the join never fails).
text = ' '.join([nonan(abstract) for abstract in papers_df["abstract"]])
# collocations=False disables wordcloud's automatic bigram grouping.
wordcloud = WordCloud(max_font_size=None, background_color='white', collocations=False,
width=1200, height=1000).generate(text)
# NOTE(review): px.imshow may need an array — verify wordcloud.to_array()
# isn't required here.
fig = px.imshow(wordcloud)
fig.update_layout(title_text='Common words in abstracts')